library(CrossClustering)
library(GEOquery)
library(limma)
library(cluster)
library(mclust)

rm(list=ls())

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("GEOquery")

# load data
gset <- getGEO("GSE38888", GSEMatrix =TRUE)
if (length(gset) > 1) idx <- grep("GPL14965", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
ex <- exprs(gset)

# Euclidean distance
d_euc <- dist(t(ex), method="euclidean")

phenotData <- pData(gset)

tClass <- phenotData$characteristics_ch1.2 #2
names(tClass) <- rownames(phenotData)
table(tClass)

# true groups
true_part <- numeric()
for (i in 1:length(table(tClass))){
  true_part[tClass==names(table(tClass))[i]]=i
}

# Cross-Clustering 

CC_euc <- cc_crossclustering(
  dist=d_euc,
  k_w_min = 2,
  k_w_max = 19,
  k2_max = 20,
  out = F,
  method = "complete"
)

CrossClustering::ari(table(
  CrossClustering::cc_get_cluster(CC_euc),
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.63

# comparison with Ward & Complete-linkage
clusto <- function(x, distMat){
  cutree(hclust(distMat, method="ward.D"), k=x)
}

clucomp <- function(x, distMat){
  cutree(hclust(distMat, method="complete"), k=x)
}

members_euc.ward <- sapply(2:20, clusto, distMat=d_euc)
members_euc.comp <- sapply(2:20, clucomp, distMat=d_euc)

Sil_euc.ward <- Sil_euc.complete <- numeric(19)

for (c in 1:ncol(members_euc.ward)) {
  Sil_euc.ward[c] <- summary(silhouette(x=members_euc.ward[,c],
                                        dist=d_euc))$avg.width
  Sil_euc.complete[c] <- summary(silhouette(x=members_euc.comp[,c],
                                            dist=d_euc))$avg.width
}

names(Sil_euc.ward) <- names(Sil_euc.complete) <- paste("Clus", 2:20, sep="")

# Average Silhouette Width maximum
groupsGE_ward_euc <- members_euc.ward[,which.max(Sil_euc.ward)] # 11 clusters
groupsGE_comp_euc <- members_euc.comp[,which.max(Sil_euc.complete)] # 11 clusters

n=dim(ex)[2] # n. obs

CrossClustering::ari(table(
  groupsGE_ward_euc,
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.09

CrossClustering::ari(table(
  groupsGE_comp_euc,
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.1

# dendrogram
my.clu_ward <- hclust(d_euc, method="ward.D")
my.clu_comp <- hclust(d_euc, method="complete")

source("http://addictedtor.free.fr/packages/A2R/lastVersion/R/code.R")

Subtypes=substring(tClass, first=19, last=1000000L)
Subtypes[Subtypes==" luminal"] <- "Luminal"
Subtypes[Subtypes==" Triple Negative"] <- "TN"

# Figure S123
A2Rplot(my.clu_ward,
        k=11,
        fact.sup=Subtypes,
        boxes = FALSE,
        col.up = "gray",
        col.down = 1:11, show.labels=FALSE, main="Ward")

# Figure S124
A2Rplot(my.clu_comp,
        k=11,
        fact.sup=Subtypes,
        boxes = FALSE,
        col.up = "gray",
        col.down = 1:11, show.labels=FALSE, main="Complete")

#### K-means

km1 <- km2 <- km3 <- km4 <- km5 <- km6 <- km7 <- km8 <- km9 <- km10 <- matrix(NA, ncol=19, nrow=n)
SilKm1 <- SilKm2 <- SilKm3 <- SilKm4 <- SilKm5 <- SilKm6 <- SilKm7 <- SilKm8 <- SilKm9 <- SilKm10 <- numeric(19)

data <- t(ex) 
d <- d_euc

# ran the algorithm 10 times, every time choosing the k in the interval [2, 20] that maximizes the ASW
for (k in 2:20){
  km1[,k-1] <- kmeans(data, c=k)$cluster
  SilKm1[k-1] <- mean(silhouette(as.numeric(km1[,k-1]), dist=d)[,3])
  
  km2[,k-1] <- kmeans(data, c=k)$cluster
  SilKm2[k-1] <- mean(silhouette(as.numeric(km2[,k-1]), dist=d)[,3])
  
  km3[,k-1] <- kmeans(data, c=k)$cluster
  SilKm3[k-1] <- mean(silhouette(as.numeric(km3[,k-1]), dist=d)[,3])
  
  km4[,k-1] <- kmeans(data, c=k)$cluster
  SilKm4[k-1] <- mean(silhouette(as.numeric(km4[,k-1]), dist=d)[,3])
  
  km5[,k-1] <- kmeans(data, c=k)$cluster
  SilKm5[k-1] <- mean(silhouette(as.numeric(km5[,k-1]), dist=d)[,3])
  
  km6[,k-1] <- kmeans(data, c=k)$cluster
  SilKm6[k-1] <- mean(silhouette(as.numeric(km6[,k-1]), dist=d)[,3])
  
  km7[,k-1] <- kmeans(data, c=k)$cluster
  SilKm7[k-1] <- mean(silhouette(as.numeric(km7[,k-1]), dist=d)[,3])
  
  km8[,k-1] <- kmeans(data, c=k)$cluster
  SilKm8[k-1] <- mean(silhouette(as.numeric(km8[,k-1]), dist=d)[,3])
  
  km9[,k-1] <- kmeans(data, c=k)$cluster
  SilKm9[k-1] <- mean(silhouette(as.numeric(km9[,k-1]), dist=d)[,3])
  
  km10[,k-1] <- kmeans(data, c=k)$cluster
  SilKm10[k-1] <- mean(silhouette(as.numeric(km10[,k-1]), dist=d)[,3])
}

lista_km <- list(km1, km2, km3, km4, km5, km6, km7, km8, km9, km10)
lista_Silkm <- list(SilKm1, SilKm2, SilKm3, SilKm4, SilKm5, SilKm6,
                    SilKm7, SilKm8, SilKm9, SilKm10)

CrossClustering::ari(table(
  lista_km[[which.max(sapply(lista_Silkm, max))]][,sapply(lista_Silkm, which.max)[which.max(sapply(lista_Silkm, max))]],
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.04

## SOM
library(kohonen)

# we applied SOM with 70 combinations of dimensions

griglia <- matrix(NA, ncol=3, nrow=70)
griglia[,1] <- c(rep(1, 19), rep(2, 10), rep(3, 7), rep(4, 5), rep(5, 4), rep(6, 4), rep(7, 3), rep(8, 3), rep(9, 3), rep(10, 2), 11:20)
griglia[,2] <- c(2:20, 1:10, 1:7, 1:5, 1:4, 1:4, 1:3, 1:3, 1:3, 1:2, rep(1, 10))
griglia[,3]  <- griglia[,1]*griglia[,2]

my.som <- matrix(NA, nrow=n, ncol=nrow(griglia))

set.seed(123)
for (c in 1:nrow(griglia)){
  my.som[,c] <- som(data, grid=somgrid(xdim=griglia[c,1], ydim=griglia[c,2],
                                       "rectangular"))$unit.classif
}

SilSom <- numeric(ncol(my.som))
for (c in 1:ncol(my.som)){
  SilSom[c] <- mean(silhouette(as.numeric(my.som[,c]), dist=d)[,3])
}

som <- matrix(NA, nrow=n, ncol=10)
colnames(som) <- paste("Prova", 1:10, sep="")
for (c in 1:10){
  som[,c] <- som(data, grid=somgrid(xdim=9, ydim=2, "rectangular"))$unit.classif
}

sil_som <- numeric(10)
for (c in 1:10){
  sil_som[c] <- mean(silhouette(as.numeric(som[,c]), dist=d)[,3])
}

CrossClustering::ari(table(
  som[,which.max(sil_som)],
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.1

# DBSCAN
library(RANN)
library(fpc)
nearest3 <- nn2(data=t(ex), k=3)
plot(nearest3[[2]][order(nearest3[[2]][,3], decreasing=T),3], type="l", axes=F,
     ylab="2-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=3, col="red")
nearest3[[2]][order(nearest3[[2]][,3], decreasing=T),3][3]

nearest4 <- nn2(data=t(ex), k=4)
plot(nearest4[[2]][order(nearest4[[2]][,4], decreasing=T),4], type="l", axes=F,
     ylab="3-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=3, col="red")
nearest4[[2]][order(nearest4[[2]][,4], decreasing=T),4][3]

nearest5 <- nn2(data=t(ex), k=5)
plot(nearest5[[2]][order(nearest5[[2]][,5], decreasing=T),5], type="l", axes=F,
     ylab="4-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=5, col="red")
nearest5[[2]][order(nearest5[[2]][,5], decreasing=T),5][5]

nearest6 <- nn2(data=t(ex), k=6)
plot(nearest6[[2]][order(nearest6[[2]][,6], decreasing=T),6], type="l", axes=F,
     ylab="5-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=5, col="red")
nearest6[[2]][order(nearest6[[2]][,6], decreasing=T),6][5]

nearest7 <- nn2(data=t(ex), k=7)
plot(nearest7[[2]][order(nearest7[[2]][,7], decreasing=T),7], type="l", axes=F,
     ylab="6-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=4, col="red")
nearest7[[2]][order(nearest7[[2]][,7], decreasing=T),7][4]

nearest8 <- nn2(data=t(ex), k=8)
plot(nearest8[[2]][order(nearest8[[2]][,8], decreasing=T),8], type="l", axes=F,
     ylab="7-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=4, col="red")
nearest8[[2]][order(nearest8[[2]][,8], decreasing=T),8][4]

db4=dbscan(t(ex), eps=nearest5[[2]][order(nearest5[[2]][,5], decreasing=T),5][3], MinPts=4)$cluster
db5=dbscan(t(ex), eps=nearest6[[2]][order(nearest6[[2]][,6], decreasing=T),6][5], MinPts=5)$cluster
db6=dbscan(t(ex), eps=nearest7[[2]][order(nearest7[[2]][,7], decreasing=T),7][5], MinPts=6)$cluster
db7=dbscan(t(ex), eps=nearest8[[2]][order(nearest8[[2]][,8], decreasing=T),8][4], MinPts=7)$cluster

#### Heatmap ####

# relabeling for graphical purposes

cc=CrossClustering::cc_get_cluster(CC_euc)[order(true_part)]
true<-true_part[order(true_part)]+1
ward<-members_euc.ward[,which.max(Sil_euc.ward)][order(true_part)]+1
comp<-members_euc.comp[,which.max(Sil_euc.complete)][order(true_part)]+1
km<-lista_km[[which.max(sapply(lista_Silkm, max))]][,sapply(lista_Silkm, which.max)[which.max(sapply(lista_Silkm, max))]][order(true_part)]+1
somme<-som[,which.max(sil_som)][order(true_part)]+1
db<-db4+1

hm <- matrix(NA, nrow=n, ncol=7)
colnames(hm) <- c("Ward", "CL", "K-means", "SOM", "DBSCAN", "CC", "Membership")
rownames(hm) <- paste("Elem", 1:n, sep="")
hm[,1] <- ward
hm[,2] <- comp
hm[,3] <- km
hm[,4] <- somme
hm[,5] <- db
hm[,6] <- cc
hm[,7] <- true

library(fields)
my_palette <- colorRampPalette(c("white", "yellow", "green", "tomato", "orangered", "red", "darkmagenta", "deeppink", "purple", "blue", "royalblue", "cadetblue", "chartreuse", "dimgray", "lightseagreen"))(n = 20)
par(mar=c(5,4.5,4,7))
image(hm, col=my_palette, axes=F, xlab="Samples")
mtext(text=colnames(hm), side=2, line=0.3, at=seq(0,1,0.16), las=1, cex=0.65)
image.plot(hm, col=my_palette, legend.only=T,
           legend.lab="Cluster Membership",
           lab.breaks=c("Outliers", "1", "2", "3", "4"))
title("Breast cancer data")